import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
import os
import plotly.graph_objects as go
import plotly.express as px
from IPython.display import display
import warnings
from plotly.subplots import make_subplots
warnings.filterwarnings('ignore')
# Report the NumPy version for reproducibility of the analysis environment.
print(f"Numpy version: {np.__version__}")
# Download latest version
# of the pre-cleaned "Students Performance" Kaggle dataset; `path` is the
# local directory the archive was extracted to.
path = kagglehub.dataset_download("muhammadroshaanriaz/students-performance-dataset-cleaned")
print("Path to dataset files:", path)
os.listdir(path)  # notebook-style cell output: list the downloaded files
# Load the cleaned CSV into the main working DataFrame.
data = pd.read_csv(path + "/Cleaned_Students_Performance.csv")
data.info()   # dtypes and non-null counts per column
data.head(5)  # quick visual sanity check of the first rows
# Count missing values per column (expected all zero for this dataset).
nans_count = data.isnull().sum()
print(nans_count)
No need for data imputation as there are no missing values.
Features (x)
Gender: Useful for analyzing performance differences between male and female students.
Race/Ethnicity: Allows analysis of academic performance trends across different racial or ethnic groups.
Parental Level of Education: Indicates the educational background of the student's family.
Lunch: Shows whether students receive a free or reduced lunch, which is often a socioeconomic indicator.
Test Preparation Course: This tells whether students completed a test prep course, which could impact their performance.
Variables of Interest (y)
Math Score: Provides a measure of each student’s performance in math, used to calculate averages or trends across various demographics.
Reading Score: Measures performance in reading, allowing for insights into literacy and comprehension levels among students.
Writing Score: Evaluates students' writing skills, which can be analyzed to assess overall literacy and expression.
Total Score: The sum of the reading, math, and writing scores.
Initial view of the data
# Show the data metrics
# describe(include='all') reports stats for numeric AND categorical columns.
data.describe(include='all')
#Computing the means, medians and modes of the data
# Numerical data: drop the binary-encoded categorical columns so describe()
# summarizes only the score columns.
print(f"Numerical data Metrics:")
display(data.drop(columns=["gender","lunch","test_preparation_course"]).describe())
print(f"Categorical data Metrics:")
# Cast the remaining columns to 'category' so describe() reports
# count/unique/top/freq instead of numeric statistics.
display(data.drop(columns=["math_score","reading_score","writing_score","total_score","average_score"]).astype("category").describe())
# Pair Plot for Numerical data
# NOTE(review): sns.pairplot creates its own figure grid, so this plt.figure
# call presumably has no effect on the pairplot's size — verify.
plt.figure(figsize=(20, 10))
sns.pairplot(data=data,vars=["math_score","reading_score","writing_score","average_score"],diag_kind="auto")
plt.show()
# --- Donut charts: population split by gender and by lunch type --------------
# NOTE(review): this cell decodes gender as {0: male, 1: female} and lunch as
# {0: standard, 1: free/reduced}, but the box-plot cells below use the
# OPPOSITE encodings ({1: Male, 0: Female} / {1: Standard, 0: Free/Reduced}).
# One of the two is mislabeled — confirm the encoding against the raw dataset.
gender_counts = data["gender"].map({0: "male", 1: "female"}).value_counts()
gender_counts_labels = gender_counts.index  # fixed typo: was `gendet_counts_labels`
lunch_counts = data["lunch"].map({0: "standard", 1: "free/reduced"}).value_counts()
lunch_labels = lunch_counts.index
# Side-by-side pies require 'domain'-type subplots.
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=gender_counts_labels, values=gender_counts, name="Gender",textfont=dict(size=25),marker_colors=['#000042', '#A4303F']),
              1, 1)
fig.add_trace(go.Pie(labels=lunch_labels, values=lunch_counts, name="Lunch",textfont=dict(size=25),marker_colors=['#F5A300','#177E89']),
              1, 2)
# Use `hole` to create a donut-like pie chart
fig.update_traces(hole=.4, hoverinfo="label+percent+name")
fig.update_layout(
    # Add annotations in the center of the donut pies (centered on each
    # subplot's x-domain).
    annotations=[dict(text='Gender', x=sum(fig.get_subplot(1, 1).x) / 2, y=0.5,
                      font_size=30, showarrow=False, xanchor="center"),
                 dict(text='Lunch', x=sum(fig.get_subplot(1, 2).x) / 2, y=0.5,
                      font_size=30, showarrow=False, xanchor="center")],
    legend=dict(font=dict(size=25),title=dict(text="Populations")),
    # fixed typo in displayed title: "Categorial" -> "Categorical"
    title=dict(text="Categorical Data Populations",font=dict(size=30)))
fig.show()
# --- Donut charts: parental education level and race/ethnicity populations ---
edu_counts = data["parental_level_of_education"].value_counts()
edu_labels = edu_counts.index  # reuse the counts instead of recomputing value_counts()
ethnicity_counts = data["race_ethnicity"].value_counts()
ethnicity_labels = ethnicity_counts.index
# Color palettes for the two pies.
color_discrete_sequence_2=["#C3F73A","#306B34","#EF065B","#64A7CE","#5716A2",]
color_discrete_sequence_1=[
    px.colors.qualitative.Dark2[0],
    px.colors.qualitative.Dark24[7],
    px.colors.qualitative.Dark2[2],
    px.colors.qualitative.Dark2[6],
    px.colors.qualitative.Safe[1],
    px.colors.qualitative.Alphabet[13],]
fig2 = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig2.add_trace(go.Pie(labels=edu_labels, values=edu_counts, name="Parent Edu",textfont=dict(size=25),marker_colors=color_discrete_sequence_1),
               1, 1)
fig2.add_trace(go.Pie(labels=ethnicity_labels, values=ethnicity_counts, name="Ethnicity",textfont=dict(size=25),marker_colors=color_discrete_sequence_2),
               1, 2)
fig2.update_traces(hole=.6, hoverinfo="label+percent+name")
fig2.update_layout(
    # Add annotations in the center of the donut pies.
    # BUG FIX: the annotation x-positions previously read subplot domains from
    # the EARLIER figure `fig`; they must come from this figure, `fig2` (and
    # would crash if the previous cell had not run).
    annotations=[dict(text='Parental<br>Education', x=sum(fig2.get_subplot(1, 1).x) / 2, y=0.5,
                      font_size=30, showarrow=False, xanchor="center"),
                 dict(text='Ethnicity', x=sum(fig2.get_subplot(1, 2).x) / 2, y=0.5,
                      font_size=30, showarrow=False, xanchor="center")],
    legend=dict(font=dict(size=12),title=dict(text="Populations")),
    title=dict(text="",font=dict(size=30)))
fig2.show()
# The three raw subject scores; average_score is appended below.
test_list = ["math_score","reading_score","writing_score"]
# Creating an average score column (row-wise mean of the three subjects)
data["average_score"] = data[test_list].mean(axis=1)
test_list.append("average_score")
# Convert the data into long format: one row per (student, subject) pair.
genderwise_scores = pd.melt(data, id_vars=["gender"],
                            value_vars=test_list,
                            var_name="Subject", value_name="Score")
# Replace gender values with labels
# NOTE(review): this decodes gender as {1: Male, 0: Female}, the OPPOSITE of
# the pie-chart cell above ({0: male, 1: female}). One of the two mappings
# must be wrong — verify against the raw dataset.
genderwise_scores["gender"] = genderwise_scores["gender"].map({1: "Male", 0: "Female"})
# Create the box plot
plt.figure(figsize=(10, 6))
sns.boxplot(x="Subject", y="Score", hue="gender", data=genderwise_scores, palette=["blue", "pink"])
# Adding titles and labels
plt.title("Scores by Gender in Different Subjects")
plt.xlabel("Subject")
plt.ylabel("Score")
# Display the plot
plt.show()
# Long-format scores keyed by lunch type, for a grouped box plot.
lunchwise_scores = pd.melt(data, id_vars=["lunch"],
                           value_vars=test_list,
                           var_name="Subject", value_name="Score")
# NOTE(review): this decodes lunch as {1: Standard, 0: Free/Reduced}, the
# OPPOSITE of the pie-chart cell above ({0: standard, 1: free/reduced}).
# One of the two mappings must be wrong — verify against the raw dataset.
lunchwise_scores["lunch"] = lunchwise_scores["lunch"].map({1: "Standard", 0: "Free/Reduced"})
plt.figure(figsize=(10, 6))
sns.boxplot(x="Subject", y="Score", hue="lunch", data=lunchwise_scores, palette=["green", "red"])
plt.title("Scores by Lunch Type in Different Subjects")
plt.xlabel("Subject")
plt.ylabel("Score")
plt.show()
# Long-format scores keyed by test-prep completion, for a grouped box plot.
testprepwise_scores = pd.melt(data, id_vars=["test_preparation_course"],
                              value_vars=test_list,var_name="Subject",value_name="Score")
# Decode the 0/1 flag into readable category labels.
testprepwise_scores["test_preparation_course"] = testprepwise_scores["test_preparation_course"].map({1:"Taken",0:"Not Taken"})
plt.figure(figsize=(10,6))
sns.boxplot(x="Subject",y="Score",hue="test_preparation_course",data=testprepwise_scores,palette=["orange","purple"])
plt.title("Scores by Test Preparation Course in Different Subjects")
plt.xlabel("Subject")
plt.ylabel("Score")
plt.show()
# Get sorted unique labels from the race_ethnicity column
race_labels = sorted(data["race_ethnicity"].unique().tolist())
# Long-format scores keyed by ethnicity group.
racewise_scores = pd.melt(data, id_vars=["race_ethnicity"],
                          value_vars=test_list,
                          var_name="Subject", value_name="Score")
# Create the box plot
plt.figure(figsize=(10, 6))
# hue_order keeps the groups in sorted (alphabetical) order in the legend.
sns.boxplot(x="Subject", y="Score", hue="race_ethnicity", data=racewise_scores, hue_order=race_labels)
plt.title("Race-wise Scores in Different Subjects")
plt.xlabel("Subject")
plt.ylabel("Score")
plt.legend(title="Ethnicity",ncols=5)
plt.show()
# Parental education groups in dataset order (unlike race_labels, not sorted).
parent_edu_labels = data["parental_level_of_education"].unique().tolist()
# Long-format scores keyed by parental education level.
parent_edu_scores = pd.melt(data,id_vars=["parental_level_of_education"],value_vars=test_list,var_name="Subject",value_name="Score")
plt.figure(figsize=(10,6))
sns.boxplot(x="Subject",y="Score",hue="parental_level_of_education",data=parent_edu_scores,hue_order=parent_edu_labels)
plt.title("Parental Education-wise Scores in Different Subjects")
plt.xlabel("Subject")
plt.ylabel("Score")
plt.legend(title="Parental Education",loc="lower left",ncol=3)
# plt.ylim(-30,120)
plt.show()
# Create a correlation matrix between the features and overall performance.
# Does not make sense to calculate correlation between categorical variables:
# the frame still contains the string-typed columns race_ethnicity and
# parental_level_of_education, so numeric_only=True restricts corr() to the
# numeric columns (pandas >= 2.0 would otherwise raise on object dtypes).
correlation_data = data.drop(columns=["reading_score","writing_score","math_score","total_score"])
correlation = correlation_data.corr(numeric_only=True)
plt.figure(figsize=(10,6))
sns.heatmap(correlation,annot=True,cmap="viridis")
plt.title("Correlation Matrix")
plt.show()
# Correlation between the three raw subject scores (average_score excluded:
# it is a linear combination of the others and would trivially correlate).
score_correlation = data[test_list].drop(columns="average_score").corr()
plt.figure(figsize=(10,6))
sns.heatmap(score_correlation,annot=True,cmap="viridis")
plt.title("Correlation between Scores")
plt.show()
#Checking the correlation between features and individual scores
# Create a correlation matrix
test_list = ["math_score", "reading_score", "writing_score"]
def individual_score_correlation(data, checked_test):
    """Plot a heatmap of feature correlations against one remaining score.

    Parameters
    ----------
    data : pd.DataFrame
        Full dataset including all score columns.
    checked_test : list[str]
        Score columns to drop, leaving a single subject score in the frame.
    """
    # numeric_only=True keeps the object-dtype categorical columns
    # (race_ethnicity, parental_level_of_education) from breaking corr()
    # on pandas >= 2.0.
    correlation_data = data.drop(columns=checked_test).corr(numeric_only=True)
    plt.figure(figsize=(10, 6))
    sns.heatmap(correlation_data, annot=True, cmap="viridis")
    plt.title("Correlation Matrix")
for subject in test_list:
    # Drop every score column except `subject`, so each heatmap shows how the
    # demographic features correlate with that single score.
    tests = ["math_score", "reading_score", "writing_score", "average_score","total_score"]
    tests.remove(subject)
    individual_score_correlation(data, tests)
plt.show()
from scipy.stats import ttest_ind
# --- Two-sample t-tests: male vs. female mean score per subject --------------
print("*"*65)
print(" "*23+"T-Tests for Gender")
print("*"*65,end="\n")
for subject in test_list:
    # Scores for the CURRENT subject, split by gender group.
    # (Renamed from *_math_scores: the old names wrongly implied math only.)
    male_scores = genderwise_scores[genderwise_scores["gender"]=="Male"]
    male_subject_scores = male_scores[male_scores["Subject"]==subject]["Score"]
    female_scores = genderwise_scores[genderwise_scores["gender"]=="Female"]
    female_subject_scores = female_scores[female_scores["Subject"]==subject]["Score"]
    # Perform independent two-sample t-test (null hypothesis: equal means).
    t_statistics, p_values = ttest_ind(male_subject_scores,female_subject_scores)
    print(f"Significance Difference between Male and Female for {subject}")
    print("="*65)
    print("T statistics:", t_statistics)
    print("P values:", p_values)
    #Using a significance level of 0.05 ==> Confidence level of 95%
    if p_values < 0.05:
        print("Null hypothesis rejected. There is a significant difference\n")
# --- Two-sample t-tests: standard vs. free/reduced lunch per subject ---------
print("\n"+"*"*75)
print(" "*25+"T-Tests for Lunch Type")
print("*"*75, end="\n"*2)
for subject in test_list:
    # Scores for the CURRENT subject, split by lunch type.
    # (Renamed from *_math_scores: the old names wrongly implied math only.)
    standard_scores = lunchwise_scores[lunchwise_scores["lunch"]=="Standard"]
    standard_subject_scores = standard_scores[standard_scores["Subject"]==subject]["Score"]
    free_scores = lunchwise_scores[lunchwise_scores["lunch"]=="Free/Reduced"]
    free_subject_scores = free_scores[free_scores["Subject"]==subject]["Score"]
    # Perform independent two-sample t-test (null hypothesis: equal means).
    t_statistics, p_values = ttest_ind(standard_subject_scores,free_subject_scores)
    print(f"Significance Difference between Standard and Free/Reduced for {subject}")
    print("="*75)
    print("T statistics:", t_statistics)
    print("P values:", p_values)
    #Using a significance level of 0.05 ==> Confidence level of 95%
    if p_values < 0.05:
        print("Null hypothesis rejected. There is a significant difference\n")
from scipy.stats import f_oneway
# Performing one-way ANOVA for test parental education level: more than two
# groups, so a pairwise t-test no longer applies.
print("*"*75)
print(" "*15+"One-Way ANOVA for Parental Education Level")
print("*"*75, end="\n"*2)
for subject in test_list:
    # One sample of this subject's scores per education level.
    edu_scores = parent_edu_scores[parent_edu_scores["Subject"]==subject]
    edu_scores = [edu_scores[edu_scores["parental_level_of_education"]==edu]["Score"] for edu in parent_edu_labels]
    # Perform one-way ANOVA (null hypothesis: all group means are equal)
    f_statistics, p_values = f_oneway(*edu_scores)
    print(f"Significance Difference between Parental Education Levels for {subject}")
    print("="*75)
    print("F statistics:", f_statistics)
    print("P values:", p_values)
    #Using a significance level of 0.05 ==> Confidence level of 95%
    if p_values < 0.05:
        print("Null hypothesis rejected. There is a significant difference\n")
# Performing one-way ANOVA for different ethnicities
print("*"*75)
print(" "*15+"One-Way ANOVA for Ethnicities")
print("*"*75, end="\n"*2)
for subject in test_list:
    # One sample of this subject's scores per ethnicity group.
    ethnicity_scores = racewise_scores[racewise_scores["Subject"]==subject]
    ethnicity_scores = [ethnicity_scores[ethnicity_scores["race_ethnicity"]==race]["Score"] for race in race_labels]
    # Perform one-way ANOVA (null hypothesis: all group means are equal)
    f_statistics, p_values = f_oneway(*ethnicity_scores)
    print(f"Significance Difference between Ethnicities for {subject}")
    print("="*75)
    print("F statistics:", f_statistics)
    print("P values:", p_values)
    #Using a significance level of 0.05 ==> Confidence level of 95%
    if p_values < 0.05:
        print("Null hypothesis rejected. There is a significant difference\n")
The demographic and background features will definitely be used as features for our model.
Defined Input Features: Select demographic and background features (lunch, test prep, gender, ethnicity, parental education) as inputs for modeling.
# Model inputs: all demographic/background columns (every score column dropped).
x_features = data.drop(columns=["math_score","reading_score","writing_score","total_score","average_score"])
# Candidate prediction targets: the three raw subject scores.
y_target = data[["math_score","reading_score","writing_score"]]
print(f"="*35+ " Features " +"="*35)
# NOTE: bare describe() expressions only render in a notebook (and only the
# last expression of a cell); in a plain script they produce no output.
x_features.astype("category").describe()
print(f"="*13+ " Potential Targets " +"="*13)
y_target.describe()
import plotly
# Configure Plotly to be rendered inline in the notebook.
plotly.offline.init_notebook_mode()
# 3D scatter of the three raw scores — each point is one student.
# Configure the trace.
trace = go.Scatter3d(
    x= data["math_score"],
    y=data["reading_score"],
    z=data["writing_score"],
    mode='markers',
    marker={
        'size': 10,
        'opacity': 0.8,
        'color': 'blue',
    }
)
trace.name = 'Initial Data'
# Configure the layout. (`layout` is reused by the outlier plots below.)
layout = go.Layout(
    margin={'l': 50, 'r': 0, 'b': 0, 't':30},
    scene=dict(xaxis=dict(title=dict(text='Math Score'),color='red'),
               yaxis=dict(title=dict(text='Reading Score'),color='red'),
               zaxis=dict(title=dict(text='Writing Score'),color='red')
               ),
    showlegend=True
)
data2 = [trace]
plot_figure = go.Figure(data=data2, layout=layout)
# Styling: red Courier text on a transparent background.
plot_figure.update_layout(title_font_color='red',title_font_size=20)
plot_figure.update_layout(title="3D Scatter Plot of Scores",paper_bgcolor='rgba(0,0,0,0)',plot_bgcolor='rgba(0,0,0,0)',font=dict(color='red'))
plot_figure.update_layout(font_family="Courier New",font_size=13)
plot_figure.update_layout(legend=dict(font=dict(size=20),title=dict(text="Data Type",font=dict(size=20))))
# Render the plot.
plotly.offline.iplot(plot_figure)
# IQR (interquartile range) outlier detection on the three score columns.
Q1 = y_target.quantile(0.25)
Q3 = y_target.quantile(0.75)
IQR = Q3 - Q1
# Fence multiplier: 2.0 is looser than the conventional 1.5, so only the more
# extreme points are flagged.
threshold = 2.0
# A student is an outlier if ANY of the three scores falls outside its fences.
data_outliers = y_target[
    ((y_target["math_score"] < (Q1["math_score"] - threshold * IQR["math_score"])) |
    (y_target["math_score"] > (Q3["math_score"] + threshold * IQR["math_score"])) |
    (y_target["reading_score"] < (Q1["reading_score"] -threshold * IQR["reading_score"])) |
    (y_target["reading_score"] > (Q3["reading_score"] +threshold * IQR["reading_score"])) |
    (y_target["writing_score"] < (Q1["writing_score"] -threshold * IQR["writing_score"])) |
    (y_target["writing_score"] > (Q3["writing_score"] +threshold * IQR["writing_score"])))
]
print("Detected Outliers:")
data_outliers  # notebook cell output: show the flagged rows
# Overlay the detected outliers (red) on the original scatter (blue).
trace2 = go.Scatter3d(
    x= data_outliers["math_score"],
    y=data_outliers["reading_score"],
    z=data_outliers["writing_score"],
    mode='markers',
    marker={
        'size': 10,
        'opacity': 0.9,
        'color': "red"
    }
)
trace2.name = 'Detected Outliers'
# Reuse the base trace and layout from the previous 3D plot.
data2 = [trace,trace2]
plot_figure = go.Figure(data=data2, layout=layout)
# Styling, as before.
plot_figure.update_layout(title_font_color='red',title_font_size=20)
plot_figure.update_layout(title="3D Scatter Plot of Scores",paper_bgcolor='rgba(0,0,0,0)',plot_bgcolor='rgba(0,0,0,0)',autosize=True,font=dict(color='red'))
plot_figure.update_layout(font_family="Courier New",font_size=13)
plot_figure.update_layout(legend=dict(font=dict(size=20),title=dict(text="Data Type",font=dict(size=20))))
# Render the plot.
plotly.offline.iplot(plot_figure)
# Cap (winsorize) outliers to the IQR fences instead of dropping rows.
lower_bound = Q1 - threshold * IQR
upper_bound = Q3 + threshold * IQR
filtered_data = y_target.copy()
for subject in test_list:
    # BUG FIX: the original `filtered_data[mask] = value` used the boolean
    # mask as a ROW indexer on the DataFrame, overwriting EVERY column of the
    # matching rows with one subject's bound. clip() caps only this subject's
    # column, leaving the other scores intact.
    filtered_data[subject] = filtered_data[subject].clip(lower=lower_bound[subject], upper=upper_bound[subject])
# 3D overlay: original data (blue, faded) vs. outlier-treated data (red).
trace3 = go.Scatter3d(
    x= filtered_data["math_score"],
    y=filtered_data["reading_score"],
    z=filtered_data["writing_score"],
    mode='markers',
    marker={
        'size': 10,
        'opacity': .5,
        'color': "red"
    }
)
trace3.name = 'Data with Outlier Treatment'
trace.marker.opacity = 0.5  # fade the original points so the overlap is readable
data3 = [trace,trace3]
plot_figure = go.Figure(data=data3, layout=layout)
# Styling, as before.
plot_figure.update_layout(title_font_color='red',title_font_size=20)
plot_figure.update_layout(title="3D Scatter Plot of Scores",paper_bgcolor='rgba(0,0,0,0)',plot_bgcolor='rgba(0,0,0,0)',autosize=True,font=dict(color='red'))
plot_figure.update_layout(font_family="Courier New",font_size=12)
plot_figure.update_layout(legend=dict(font=dict(size=20),title=dict(text="Data Type",font=dict(size=20))))
# BUG FIX: this title tweak previously targeted `fig` (the earlier donut
# chart) instead of the figure being built here.
plot_figure.update_layout(title=dict(font=dict(size=40), yref='paper'))
# Render the plot.
plotly.offline.iplot(plot_figure)
# The three histogram cells below were copy-pasted with inconsistent titles
# and font settings; factored into one helper applied per subject.
def _plot_score_histograms(column, label):
    """Overlay normalized histograms of `column` before and after outlier
    treatment, so the effect of capping on the distribution is visible."""
    # Histogram for `filtered_data` (data after outlier capping).
    hist_filtered = go.Histogram(
        x=filtered_data[column],
        nbinsx=20,
        opacity=0.5,
        name=f'{label} (Filtered)',
        marker=dict(color='blue'),
        histnorm="probability"
    )
    # Histogram for `data` (original data, outliers included).
    hist_outliers = go.Histogram(
        x=data[column],
        nbinsx=20,
        opacity=0.5,
        name=f'{label} with Outliers',
        marker=dict(color='red'),
        histnorm="probability"
    )
    # Combine both histograms into a single figure.
    fig = go.Figure(data=[hist_filtered, hist_outliers])
    fig.update_layout(
        title=f"{label} Distribution with and without Outliers",
        xaxis_title=label,
        yaxis_title="Probability",
        barmode='overlay'  # draw both histograms on top of each other
    )
    fig.update_layout(font_size=14)
    fig.update_layout(title=dict(font=dict(size=30), yref='paper'))
    fig.show()
    return fig

for _column, _label in [("math_score", "Math Score"),
                        ("reading_score", "Reading Score"),
                        ("writing_score", "Writing Score")]:
    fig = _plot_score_histograms(_column, _label)
Features to be used: All of the columns for demographics and background will be included in order to predict the desired test scores.
Optional Prior Test Inclusion: Incorporate a known test score (math, reading, or writing) to improve the accuracy of predicting the other scores. This approach leverages prior knowledge for enhanced prediction.
# Select which test (if any) has already been taken. The known score is added
# to the model inputs and the remaining subjects become prediction targets;
# with "none", all three scores are predicted from demographics alone.
already_taken_test = "reading" # "math", "reading", "writing", "none"
X = x_features.copy()
tests_to_predict = ["math_score","reading_score","writing_score"]
# subject -> (score column to reveal to the model, message to report)
_prior_test_options = {
    "reading": ("reading_score", "Reading score has been taken. Predicting Math and Writing scores"),
    "writing": ("writing_score", "Writing score has been taken. Predicting Math and Reading scores"),
    "math": ("math_score", "Math score has been taken. Predicting Reading and Writing scores"),
}
_choice = already_taken_test.lower()
if _choice in _prior_test_options:
    _column, _message = _prior_test_options[_choice]
    X[_column] = y_target[_column]
    tests_to_predict.remove(_column)
    print(_message)
elif _choice == "none":
    print("No test has been taken yet. Predicting all scores")
else:
    print("Invalid test type. Please enter math, reading, writing or None")
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
# Per-subject results, collected for the prediction-vs-actual plots below.
predicted_values = []   # train-set predictions, one array per subject
labels_values = []      # matching train-set ground-truth labels
best_model_list = []    # tuned RandomForestRegressor per subject
# Hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [200, 400, 600, 800, 1000],
    'max_depth': [5, 7, 9],
}
# One-hot encode the categorical features ONCE: X does not change per subject,
# so re-encoding inside the loop was redundant work.
one_hot_encoded_data = pd.get_dummies(X)
for subject in tests_to_predict:
    # Target: the outlier-capped score for this subject.
    y = filtered_data[subject]
    # Perform train-test split (fixed seed => identical split for every subject)
    X_train, X_test, y_train, y_test = train_test_split(one_hot_encoded_data, y, test_size=0.2, random_state=100)
    #=========================================================================
    # Hyperparameter tuning and model training
    #=========================================================================
    # Initialize the model
    model = RandomForestRegressor(random_state=100)
    # GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    # best_estimator_ is already refit on the full training split
    # (GridSearchCV refit=True by default), so the original extra fit() call
    # was redundant and has been removed.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    y_train_pred = best_model.predict(X_train)
    # NOTE(review): the scatter plots below compare TRAIN predictions against
    # train labels; store (y_test, y_pred) instead to visualize generalization.
    predicted_values.append(y_train_pred)
    labels_values.append(y_train)
    best_model_list.append(best_model)
    #=========================================================================
    # Evaluate the model
    #=========================================================================
    # Root Mean Squared Error on both splits (a large gap indicates overfitting)
    test_rmse_error = (mean_squared_error(y_test, y_pred))**0.5
    train_rmse_error = (mean_squared_error(y_train, y_train_pred))**0.5
    # Calculate R2 Score
    r2_score_value = r2_score(y_test, y_pred)
    r2_score_train = r2_score(y_train, y_train_pred)
    # Print results
    print("\n"+"="*70)
    print(f"Subject: {subject}")
    print("="*70)
    print(f"Test Root Mean Squared Error: \t{test_rmse_error}")
    print(f"Train Root Mean Squared Error: \t{train_rmse_error}")
    print(f"Test R2 Score: \t{r2_score_value}")
    print(f"Train R2 Score: \t{r2_score_train}\n")
    # Feature importance analysis
    feature_importance = best_model.feature_importances_
    important_features = pd.DataFrame({
        'Feature': one_hot_encoded_data.columns,
        'Importance': feature_importance
    }).sort_values(by='Importance', ascending=False)
    print("Top 10 Features Based on Importance:")
    print(important_features.head(10))
    print("\n")
    # Cross-validation on the full dataset for a less split-dependent estimate.
    cv_scores = cross_val_score(best_model, one_hot_encoded_data, y, cv=5, scoring='neg_mean_squared_error')
    print(f"Cross-validated Mean Squared Error: {(-cv_scores.mean())**0.5}")
    print(f"Best Parameters: {grid_search.best_params_}")
    print("="*70+"\n")
# Assemble the per-subject train predictions/labels into DataFrames so they
# can be indexed by subject name below.
predicted_values = np.array(predicted_values).T
labels_values = np.array(labels_values).T
predicted_values = pd.DataFrame(predicted_values, columns=tests_to_predict)
labels_values = pd.DataFrame(labels_values, columns=tests_to_predict)
# Reference diagonal: a perfect model predicts exactly the actual score.
x_ideal= np.arange(0,100)
y_ideal=x_ideal
for subject in tests_to_predict:
    # "math_score" -> "Math Score". str.replace on a str cannot raise, so the
    # original try/except fallback was dead code and has been removed.
    subject_label = subject.replace('_',' ').title()
    plt.figure(figsize=(10,6))
    plt.title(f"{subject_label} Predicted vs Actual Scores")
    plt.plot(x_ideal,y_ideal,color="red",label="Ideal Line")
    # BUG FIX: the scatter was hard-coded with label "Math Score" for every
    # subject; use the current subject's display name instead.
    plt.scatter(labels_values[subject],predicted_values[subject],color="blue",label=subject_label)
    plt.xlabel("Actual Scores")
    plt.ylabel("Predicted Scores")
    plt.legend()  # the labels were previously set but never rendered
    plt.show()
Challenging to Predict Scores with Current Features: Predicting math, reading, and writing scores using only demographic features (lunch, test prep, ethnicity, parent education, and gender) is challenging and lacks accuracy.
One Known Score Improves Prediction of Others: Knowing one test score (e.g., math) significantly enhances the accuracy of predicting the other two scores (reading and writing).
Best Predictor for Performance Across Tests: A single known test score is the best indicator of a student’s performance in the other subjects. The code can be adjusted to experiment with different known test scores as inputs, setting a prior score for one subject or leaving it blank to simulate no prior knowledge.
Key Features for Predicting Math Scores: For math scores, the most important predictors are a prior test score and gender.
High Importance of Reading/Writing Scores for Each Other: When predicting reading scores, knowing the writing score is highly predictive (about 90% importance), and vice versa.